Using the data collected from existing customers, build a model that helps the marketing team identify potential customers who are relatively more likely to subscribe to a term deposit, and thus increase their hit ratio.
git repo:
import warnings
warnings.filterwarnings('ignore')
# data read and structuring
import pandas as pd
import numpy as np
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
# model building
from sklearn.linear_model import LogisticRegression
# data preparing
from sklearn.model_selection import train_test_split
# check error values
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# custom display output
from IPython.display import display, HTML
import scipy.stats as stats
# scalars to normalize and impute
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
# Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Scoring mertics
from sklearn import metrics
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score, confusion_matrix
# Imaging tools
from IPython.display import Image
from sklearn import tree
from os import system
%matplotlib inline
sns.set(color_codes=True)
# Load the bank marketing dataset; expects 'bank-full.csv' in the working directory.
df = pd.read_csv('bank-full.csv')
# First few rows — sanity check of columns and values.
df.head()
# (rows, columns) of the dataset.
df.shape
# Column dtypes — distinguishes numeric from object (categorical) columns.
df.dtypes
# 5-point summary of numeric columns only.
df.describe().transpose()
# Summary including categorical columns (adds unique/top/freq).
df.describe(include='all').transpose()
From the initial look at the 5-point summary, it appears we need to include all features to decide the outcome (term deposit).
'default', 'housing', 'loan' are discrete binary categorical values; the 5-point summary does not apply to them, but they are independent features that may influence the outcome (target: term deposit).
'job', 'marital', 'education', 'contact', 'day', 'month', 'poutcome' are multi-class categorical features that may influence the outcome.
'day' needs to be treated as a categorical feature.
Continuous features that may impact the outcome are 'age', 'balance', 'duration', 'campaign', 'pdays', 'previous'.
The target (dependent/outcome) variable is 'Target'. Since it is a discrete binary categorical value, we will apply classification.
'balance' has negative values and shall be treated.
'pdays' -1 values will be converted to 999 to obtain a weighted continuous value.
# check null or missing values
df.isnull().sum()
# check duplicate observations (keep="last" flags all but the last occurrence)
data_dup = df[df.duplicated(keep="last")]
data_dup
# check data types and null missing values
df.info()
# From the 5-point summary there are some negative balances; treat them by
# taking the absolute value.
df['balance'] = df['balance'].abs()
# Treat the -1 sentinel in pdays ("never contacted before"): replace with 999
# so the feature reads as "contacted very long ago" on a continuous scale.
# Use .loc instead of chained indexing (df.pdays[...] = ...) — chained
# assignment may write to a copy and raises SettingWithCopyWarning.
df.loc[df.pdays == -1, 'pdays'] = 999
# create a list of categorical and contineous features
# ('day' is numeric in the raw data but is treated as a categorical feature)
categorical_cols = ['default', 'housing', 'loan', 'job', 'marital',
                    'education', 'contact', 'day', 'month', 'poutcome']
contineous_cols = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
# Value counts of the categorical features to find the data population
for col in categorical_cols:
    print(df[col].value_counts())
    print()
# Also find the percentage of value count of contineous features to determine the data population we need to process
# NOTE(review): this loop iterates categorical_cols, not the continuous ones,
# despite the comment above — confirm intent.
for col in categorical_cols:
    print(df[col].value_counts(normalize=True) * 100)
    print()
# check the spread of data (number of unique values per column)
df.nunique()
# Term deposit count based on educational level
def independent_count_plot(independent):
    """Labeled count plot of one categorical feature of the notebook-global df."""
    plt.figure(figsize=(10, 7))
    plt.xticks(rotation=70)
    plt.title(f'Count of {independent} feature', fontsize=20)
    plt.xlabel(independent, fontsize=10)
    plt.ylabel('Count', fontsize=10)
    sns.countplot(x=independent, data=df)
    plt.show()
# # count of a given independent feature against the dependent feature
# def count_plot(c):
# indep_vs_dep = pd.crosstab(df[independent], df[dependent])
# print("Count:")
# print (indep_vs_dep)
# print()
# print("Percent:")
# print (indep_vs_dep.div(indep_vs_dep.sum(1).astype(float), axis = 0) * 100)
# print()
# plt.figure(figsize=(10,7))
# plt.xticks(rotation=70)
# plt.title(dependent + ' count per ' + independent + ' feature', fontsize=20)
# sns.countplot(x=dependent, hue=independent, data=df);
# plt.show()
# indep_vs_dep.div(indep_vs_dep.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
# count of a given independent feature against the dependent feature
def count_plot(independent, dependent):
    """Cross-tabulate one categorical feature against the target, print the
    raw counts and row-percentage tables, then draw a grouped count plot and
    a stacked proportion bar chart. Reads the notebook-global df."""
    xtab = pd.crosstab(df[independent], df[dependent])
    print("Count:")
    print(xtab)
    print()
    print("Percent:")
    row_totals = xtab.sum(1).astype(float)
    print(xtab.div(row_totals, axis=0) * 100)
    print()
    plt.figure(figsize=(10, 7))
    plt.title(dependent + ' count per ' + independent + ' feature', fontsize=20)
    sns.countplot(x=dependent, hue=independent, data=df)
    plt.show()
    xtab.div(row_totals, axis=0).plot(kind='bar', stacked=True)
def countplot(label):
    """Count plot of a single df column with per-bar percentage annotations
    and a percentage-scaled y-axis."""
    plt.figure(figsize=(15, 10))
    n_rows = float(len(df[label]))
    ax = sns.countplot(x=label, data=df)
    for bar in ax.patches:
        pct = 100 * bar.get_height() / n_rows
        ax.annotate('{:.1f}%'.format(pct), (bar.get_x() + 0.1, bar.get_height() + 5))
    ax.yaxis.set_ticks(np.linspace(0, n_rows, 11))
    ax.set_yticklabels(map('{:.1f}%'.format, 100 * ax.yaxis.get_majorticklocs() / n_rows))
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
    plt.show()
def countplot_withY(label, dependent):
    """Count plot of one df column split by the dependent column (hue), with
    per-bar percentage annotations and a percentage-scaled y-axis."""
    plt.figure(figsize=(20, 10))
    n_rows = float(len(df[label]))
    ax = sns.countplot(x=label, data=df, hue=dependent)
    for bar in ax.patches:
        pct = 100 * bar.get_height() / n_rows
        ax.annotate('{:.1f}%'.format(pct), (bar.get_x() + 0.1, bar.get_height() + 5))
    ax.yaxis.set_ticks(np.linspace(0, n_rows, 11))
    ax.set_yticklabels(map('{:.1f}%'.format, 100 * ax.yaxis.get_majorticklocs() / n_rows))
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
    plt.show()
def univariant_categorical(independent, dependent):
    """Run the full univariate suite for one categorical feature: crosstab
    plots, plain count plot, and count plot split by the target."""
    count_plot(independent, dependent)
    countplot(independent)
    countplot_withY(independent, dependent)
# Univariant: Personal loan count based on educational level
# (leftover from a prior notebook; the loop below is what runs here)
# categorical_columns = ['Family', 'Education', 'Securities Account', 'CD Account', 'Online', 'CreditCard']
# for col in categorical_columns:
#     univariant_categorical(col, 'Personal Loan')
# Run the full categorical univariate analysis against the term-deposit target.
for col in categorical_cols:
    univariant_categorical(col, 'Target')
def box_independent_hue(independent):
    """Box plot of one continuous feature split by the 'Target' classes."""
    plt.figure(figsize=(10, 7))
    sns.boxplot(data=df, x='Target', y=independent)
    plt.show()
def box_independent(independent):
    """Standalone box plot of one continuous feature of df."""
    plt.figure(figsize=(10, 7))
    sns.boxplot(data=df, x=independent)
    plt.show()
def dist_plot(independent, skip_dist_plot=False):
    """Distribution plot (histogram + KDE) of one continuous feature.
    Pass skip_dist_plot=True for degenerate columns where distplot fails."""
    if skip_dist_plot:
        return
    plt.figure(figsize=(10, 7))
    sns.distplot(df[independent])
def dist_plot_hue(independent):
    """Density-normalized histogram of one continuous feature of df."""
    plt.figure(figsize=(10, 7))
    plt.hist(df[independent], density=True)
def density_plot(independent, dependent):
    """Overlaid distributions of one feature for the 'no' (red, label 0) and
    'yes' (green, label 1) target classes."""
    negatives = df[df[dependent] == 'no'][independent]
    positives = df[df[dependent] == 'yes'][independent]
    sns.distplot(negatives, color='r', label=0)
    sns.distplot(positives, color='g', label=1)
    plt.legend()
    plt.show()
def univariant_contineous(independent, dependent, skip_dist_plot=False):
    """Univariate plots for one continuous feature: box plot, histogram, and
    an optionally-skipped distplot. The dependent argument is unused by the
    active calls but kept for interface compatibility."""
    box_independent(independent)
    dist_plot_hue(independent)
    dist_plot(independent, skip_dist_plot)
# Univariate analysis of each continuous feature.
univariant_contineous('age', 'Target')
univariant_contineous('balance', 'Target')
univariant_contineous('duration', 'Target')
univariant_contineous('campaign', 'Target')
# 'previous' is heavily concentrated at 0 — inspect its raw distribution.
df["previous"].unique()
df["previous"].value_counts()
df[df["Target"]=="yes"]["previous"].value_counts()
df[df["Target"]=="no"]["previous"].value_counts()
# Distplot skipped: the column is too degenerate/skewed for a KDE fit.
univariant_contineous('previous', 'Target', skip_dist_plot=True)
# box_independent('previous')
# box_independent_hue(independent)
# dist_plot(independent)
univariant_contineous('pdays', 'Target', skip_dist_plot=True)
def multivariant_contineous(independent, dependent):
    """Feature-vs-target plots: class-conditional densities, box plot split
    by the target, and a histogram."""
    density_plot(independent, dependent)
    box_independent_hue(independent)
    dist_plot_hue(independent)
# Confirm whether any -1 sentinel values remain in pdays.
df[df['pdays'] < 0]
# Replace remaining -1 values with 999 (same treatment as earlier) so the
# feature reads as "contacted very long ago". Use .loc rather than chained
# indexing (df.pdays[...] = ...), which can assign to a copy and raises
# SettingWithCopyWarning.
df.loc[df.pdays == -1, 'pdays'] = 999
df[df['pdays'] < 0].pdays
# Replace 'unknown' values of 'contact' proportionately with cellular and
# telephone, preserving the observed cellular:telephone ratio.
contact_count = df.contact.value_counts()
cell_per = contact_count['cellular'] / (contact_count['cellular'] + contact_count['telephone'])
cell_cnt = int(round(contact_count['unknown'] * cell_per))
contact_ranges = df.contact.values.tolist()
for i in range(len(contact_ranges)):
    if contact_ranges[i] == 'unknown':
        # First cell_cnt unknowns become cellular, the rest telephone.
        if cell_cnt > 0:
            contact_ranges[i] = 'cellular'
            cell_cnt -= 1
        else:
            contact_ranges[i] = 'telephone'
df['contact'] = contact_ranges
print("unknown contact count: ", df[df.contact == 'unknown'].shape[0])
# Let's look at cross tabulation between job and education
pd.crosstab(df['education'], df['job'])
# Customers with primary/secondary education mostly hold blue-collar jobs,
# while tertiary-educated customers mostly hold management jobs — use that
# pattern to impute unknown jobs from education.
df.loc[(df.job == 'unknown') &
       ((df.education == 'primary') | (df.education == 'secondary')), 'job'] = 'blue-collar'
df.loc[(df.job == 'unknown') & (df.education == 'tertiary'), 'job'] = 'management'
print("Number of customers left with unknown jobs: ", df[df.job == 'unknown'].shape[0])
# Age profile (min / mean / max) per job category.
# DataFrame.append was removed in pandas 2.0 — build with pd.concat instead.
ja_df = pd.concat([df.groupby('job').age.min().to_frame().T,
                   df.groupby('job').age.mean().to_frame().T,
                   df.groupby('job').age.max().to_frame().T],
                  ignore_index=True)
ja_df.index = ['min. age', 'avg. age', 'max. age']
ja_df
# For customers whose job is still unknown, infer it from age:
# over 60 -> retired; 15 < age <= 25 -> student.
df.loc[(df.job == 'unknown') & (df.age > 60), 'job'] = 'retired'
df.loc[(df.job == 'unknown') & ((df.age > 15) & (df.age <= 25)), 'job'] = 'student'
# Based on age alone it's difficult to infer further; mark the remaining
# customers (about 115) as unemployed for simplicity.
df.loc[df.job == 'unknown', 'job'] = 'unemployed'
print("unknown jobs count: ", df[df.job == 'unknown'].shape[0])
# Age profile per education level.
ea_df = pd.concat([df.groupby('education').age.min().to_frame().T,
                   df.groupby('education').age.mean().to_frame().T,
                   df.groupby('education').age.max().to_frame().T],
                  ignore_index=True)
ea_df.index = ['min. age', 'avg. age', 'max. age']
ea_df
# Most common education level per job category.
edu_per_job = df.groupby('job').apply(lambda x: x['education'].value_counts().index[0])
edu_job_dict = {x: y for (x, y) in zip(edu_per_job.index, edu_per_job.values)}
# Replace unknown education with the popular education level for that job.
df['education'] = df.apply(lambda x: edu_job_dict[x['job']] if x['education'] == 'unknown'
                           else x['education'], axis=1)
print("unknown education levels: ", df[df.education == 'unknown'].shape[0])
cont_cols = ['age', 'balance', 'duration', 'campaign', 'previous', 'pdays']
for col in cont_cols:
    print()
    print(col, ':', df[col].value_counts())
cont_cols = ['age', 'balance', 'duration', 'campaign', 'previous', 'pdays']
for col in cont_cols:
    try:
        multivariant_contineous(col, 'Target')
    # except Exception rather than a bare except: don't swallow SystemExit /
    # KeyboardInterrupt while tolerating plot failures.
    except Exception:
        print('Cannot plot for ', col)
plt.figure(figsize=(10,8))
try:
    sns.distplot(df[df["Target"]=="yes"]["previous"])
    sns.distplot(df[df["Target"]=="no"]["previous"])
except Exception:
    print()
plt.show()
# Encode the target as 1/0 so it can participate in correlations and modeling.
df = df.replace({'Target': {'yes': 1, 'no': 0}})
pair_cols = contineous_cols + ['Target']
sns.pairplot(df[pair_cols], hue='Target', diag_kind='hist');
# numeric_only=True: pandas >= 2.0 raises on object columns, while older
# pandas silently dropped them — this keeps the old behavior explicitly.
df.corr(numeric_only=True)
plt.figure(figsize=(20,15))
sns.heatmap(df.corr(numeric_only=True), annot=True);
def average_contineous_plot(independent, dependent):
    """Bar chart of the mean of one continuous feature per target class."""
    class_means = df.groupby(dependent)[independent].mean()
    class_means.plot(kind='bar')
    plt.title('Average ' + independent + ' effect on ' + dependent, fontsize=20)
    plt.show()
    print()
# Mean of each continuous feature per target class — quick effect-size view.
for col in contineous_cols:
    average_contineous_plot(col, 'Target')
# Re-inspect columns and the continuous feature list.
df.columns
contineous_cols
# Let's look at the skewness of each continuous feature (0 = symmetric).
skew_df = pd.DataFrame({'Skewness' : [stats.skew(df.age),
                                      stats.skew(df.balance),
                                      stats.skew(df.duration),
                                      stats.skew(df.campaign),
                                      stats.skew(df.pdays),
                                      stats.skew(df.previous)]},
                       index=['Age','balance','duration','campaign','pdays', 'previous'])
skew_df
df.info()
# Re-apply the absolute-value treatment and confirm no negative balances remain.
df['balance'] = df['balance'].abs()
df[df.balance < 0]
# Re-check duplicates after all treatments.
data_dup = df[df.duplicated(keep="last")]
data_dup
# Work on a copy so the cleaned raw df is preserved.
data_processed = df.copy()
# treat the outliers using RobustScaler (median/IQR based, robust to outliers)
from sklearn import preprocessing
scaler = preprocessing.RobustScaler()
# Get list of independent variables to scale
variables_to_scale = ['age', 'balance', 'duration', 'campaign', 'previous', 'pdays']
data_processed[variables_to_scale] = scaler.fit_transform(data_processed[variables_to_scale])
data_processed
# Boolean mask of object-dtype (string) columns.
categorical_feature_mask = df.dtypes==object
# filter categorical columns using mask and turn it into a list
categorical_cols = df.columns[categorical_feature_mask].tolist()
categorical_cols
# 'day' is an int column, so the dtype mask missed it; add it explicitly.
categorical_cols.append('day')
categorical_cols
for col in categorical_cols:
    data_processed[col] = data_processed[col].astype('category')
data_processed.dtypes
# One-hot encode; drop_first avoids the dummy-variable trap.
# 'Target' is already numeric (1/0) so it is untouched by get_dummies.
data_processed = pd.get_dummies(data_processed, drop_first=True)
# data_processed.rename(columns={'Target_yes': 'Target'}, inplace=True)
from sklearn.model_selection import train_test_split
pdata = data_processed.copy()
pdata
X = pdata.drop('Target',axis=1) # Predictor feature columns
y = pdata['Target'] # Predicted class (1=True, 0=False) (1 X m)
# 70/30 split; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.head()
X_train.columns
## Helping function to calculate the scores
# NOTE(review): this first version is effectively dead code — it is redefined
# below (with a classifier-name second parameter) before it is ever called.
# It also uses roc_auc_score, which is only imported later in the notebook.
def print_model_score(model, y_predict):
    # Relies on notebook globals X_train/X_test/y_train/y_test.
    print("Training accuracy",model.score(X_train,y_train))
    print()
    print("Testing accuracy",model.score(X_test, y_test))
    print()
    print('Confusion Matrix')
    # draw_cm prints and plots; it returns None, so this also prints "None".
    print(draw_cm(y_test,y_predict))
    print()
    print("Recall:",recall_score(y_test,y_predict))
    print()
    print("Precision:",precision_score(y_test,y_predict))
    print()
    print("F1 Score:",f1_score(y_test,y_predict))
    print()
    print("Roc Auc Score:",roc_auc_score(y_test,y_predict))
We will first use the statsmodels library to build the model and inspect coefficient significance (p-values and odds ratios).
#Build the logistic regression model
import statsmodels.api as sm
# Statsmodels Logit for interpretability (coefficients, p-values, odds ratios);
# add_constant supplies the intercept column.
logit = sm.Logit(y_train, sm.add_constant(X_train))
lg = logit.fit()
#Summary of logistic regression
from scipy import stats
# Monkey-patch: older statsmodels summary() calls stats.chisqprob, which was
# removed from modern scipy — restore it via the chi-square survival function.
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
print(lg.summary())
#Calculate Odds Ratio, probability
##create a data frame to collate Odds ratio, probability and p-value of the coef
lgcoef = pd.DataFrame(lg.params, columns=['coef'])
# Odds ratio = exp(coef); implied probability = OR / (1 + OR).
lgcoef.loc[:, "Odds_ratio"] = np.exp(lgcoef.coef)
lgcoef['probability'] = lgcoef['Odds_ratio']/(1+lgcoef['Odds_ratio'])
lgcoef['pval']=lg.pvalues
pd.options.display.float_format = '{:.2f}'.format
# Filter by significant p-value (pval <= 0.1) and sort descending by odds ratio
lgcoef = lgcoef.sort_values(by="Odds_ratio", ascending=False)
pval_filter = lgcoef['pval']<=0.1
lgcoef[pval_filter]
We will use the sklearn library to build the model and make predictions
# from sklearn import metrics
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score, classification_report
# # Fit the model on train
# lg_clf = LogisticRegression(random_state=42)
# lg_clf.fit(X_train, y_train)
col_types = [('classifier', str),
('train score', np.float128),
('test score', np.float128),
('recall score', np.float128),
('precision score', (np.float128)),
('F1 score', np.float128),
('roc auc score', np.float128)]
df_compare_model = pd.DataFrame({k: pd.Series(dtype=t) for k, t in col_types})
df_compare_model.style.format({'train score': '{:.6f}', 'test score': '{:.6f}', 'recall score': '{:.6f}',
'precision score': '{:.6f}', 'F1 score': '{:.6f}', 'roc auc score': '{:.6f}'})
## function to get confusion matrix in a proper format
def draw_cm(actual, predicted):
    """Print the confusion matrix with a plain-English reading of each cell,
    then render it as a heatmap.

    actual    -- true labels (1 = opened a term deposit, 0 = did not)
    predicted -- model predictions aligned with actual
    """
    cm = confusion_matrix(actual, predicted)
    print(cm)
    print('')
    # Message fix: the target of this model is the term deposit, not a
    # personal loan (the old wording was copied from another notebook).
    print('True Positives (TP): we correctly predicted that they will open a term deposit ', cm[1, 1])
    print('')
    print('True Negatives (TN): we correctly predicted that they will not open a term deposit ', cm[0, 0])
    print('')
    print('False Positives (FP): we incorrectly predicted that they will open a term deposit (a "Type I error") ', cm[0, 1], ' Falsely predict positive Type I error')
    print('')
    print('False Negatives (FN): we incorrectly predicted that they will not open a term deposit (a "Type II error")', cm[1, 0], ' Falsely predict negative Type II error')
    # fmt='d' renders the integer counts without a spurious decimal point.
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=[0, 1], yticklabels=[0, 1])
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()
from decimal import *
getcontext().prec = 7
def print_model_score(model, classifier):
    """Print train/test accuracy, the confusion matrix, and classification
    metrics for a fitted model, then append one summary row to the global
    df_compare_model comparison table.

    model      -- fitted sklearn-style classifier (predict/score)
    classifier -- display name recorded in the comparison table

    Relies on notebook globals X_train/X_test/y_train/y_test.
    """
    global df_compare_model
    predicted = model.predict(X_test)
    print("Training accuracy", model.score(X_train, y_train))
    print()
    print("Testing accuracy", model.score(X_test, y_test))
    print()
    print('Confusion Matrix')
    # draw_cm prints and plots; it returns None, so this also prints "None".
    print(draw_cm(y_test, predicted))
    print()
    print("Recall:", recall_score(y_test, predicted))
    print()
    print("Precision:", precision_score(y_test, predicted))
    print()
    print("F1 Score:", f1_score(y_test, predicted))
    print()
    print("Roc Auc Score:", roc_auc_score(y_test, predicted))
    new_row = {'classifier': classifier,
               'train score': Decimal(model.score(X_train, y_train)),
               'test score': Decimal(model.score(X_test, y_test)),
               'recall score': Decimal(recall_score(y_test, predicted)),
               'precision score': Decimal(precision_score(y_test, predicted)),
               'F1 score': Decimal(f1_score(y_test, predicted)),
               'roc auc score': Decimal(roc_auc_score(y_test, predicted))}
    # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
    df_compare_model = pd.concat([df_compare_model, pd.DataFrame([new_row])],
                                 ignore_index=True)
#AUC ROC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
def print_model_roc(model):
    """Plot the ROC curve for a fitted classifier on the test split and save
    it to 'Log_ROC'. Reads notebook globals X_test/y_test."""
    auc_value = roc_auc_score(y_test, model.predict(X_test))
    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    plt.figure()
    plt.plot(fpr, tpr, label='Logistic Regression (area = %0.6f)' % auc_value)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()
from yellowbrick.classifier import ClassificationReport, ROCAUC
def visClassifierResults(model_w_parameters):
    """Yellowbrick visual diagnostics for a model: classification-report
    heatmap followed by ROC/AUC curves, fit on train and scored on test."""
    for visualizer_cls in (ClassificationReport, ROCAUC):
        viz = visualizer_cls(model_w_parameters)
        viz.fit(X_train, y_train)
        viz.score(X_test, y_test)
        viz.show()
def draw_tree(dtree_clf, xtr, ytr, dot_file, image_file, class_names, feature_names):
    '''Fit a decision tree and render it as a PNG via the graphviz "dot" tool.

    dtree_clf     -- DecisionTreeClassifier instance (fitted in place)
    xtr           -- train data
    ytr           -- train labels
    dot_file      -- output dot file name (.dot extension)
    image_file    -- output image file name (.png extension)
    class_names   -- class label names
    feature_names -- list of independent feature names

    Requires the "dot" command on PATH; on failure the return code is printed
    instead of displaying the image.'''
    dtree_clf.fit(xtr, ytr)
    # Context manager guarantees the dot file is closed even if export fails
    # (the original left the handle open on exception).
    with open(dot_file, 'w') as tree_file:
        tree.export_graphviz(dtree_clf,
                             out_file=tree_file,
                             feature_names=feature_names,
                             class_names=class_names,
                             filled=True, rounded=True, special_characters=True)
    command = "dot -Tpng {0} -o {1}".format(dot_file, image_file)
    retCode = system(command)
    if retCode > 0:
        print("Error while displaying tree! error code: " + str(retCode))
    else:
        display(Image(image_file))
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score, classification_report
# Fit the model on train
lg_clf = LogisticRegression(random_state=42)
lg_clf.fit(X_train, y_train)
# Score, record one row in the comparison table, and visualize.
print_model_score(lg_clf, 'LogisticRegression')
df_compare_model
# df_compare_model.round(16)
# df_compare_model["train score"] = df_compare_model["train score"].apply(lambda x: (np.float128)(x*10000000000000000)/10000000000000000)
# df_compare_model.round(decimals=16)
# roundTwoDecimals = np.round(df_compare_model, decimals=6)
# roundTwoDecimals
print_model_roc(lg_clf)
print(classification_report(y_test, lg_clf.predict(X_test)))
visClassifierResults(lg_clf)
We will build our model using the DecisionTreeClassifier function, with the default 'gini' criterion to split. Other options include 'entropy'.
# Unpruned decision tree, default 'gini' split criterion.
dTree = DecisionTreeClassifier(criterion = 'gini', random_state=42)
dTree.fit(X_train, y_train)
# Train vs test accuracy — a large gap here indicates overfitting.
print(dTree.score(X_train, y_train))
print(dTree.score(X_test, y_test))
train_char_label = ['No', 'Yes']
# Export the fitted tree to graphviz .dot and render to PNG.
Credit_Tree_File = open('credit_tree.dot','w')
dot_data = tree.export_graphviz(dTree, out_file=Credit_Tree_File, feature_names = list(X_train), class_names = list(train_char_label))
Credit_Tree_File.close()
#Works only if "dot" command works on your machine
retCode = system("dot -Tpng credit_tree.dot -o credit_tree.png")
if(retCode>0):
    print("system command returning error: "+str(retCode))
else:
    display(Image("credit_tree.png"))
print_model_score(dTree, "Decision Tree")
df_compare_model
print_model_roc(dTree)
visClassifierResults(dTree)
# Feature importance (Gini importance), largest first; keep features >= 1%.
imp_features = pd.DataFrame(dTree.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by='Imp', ascending=False)
imp_features[imp_features.Imp >= 0.01].sort_values(by='Imp', ascending=False)
# Regularized (pruned) tree: cap depth at 6 to reduce overfitting.
dTreeR = DecisionTreeClassifier(criterion = 'gini', max_depth = 6, random_state=42)
dTreeR.fit(X_train, y_train)
print(dTreeR.score(X_train, y_train))
print(dTreeR.score(X_test, y_test))
# train_char_label = ['No', 'Yes']
# Credit_Tree_FileR = open('credit_treeR.dot','w')
# dot_data = tree.export_graphviz(dTreeR, out_file=Credit_Tree_FileR, feature_names = list(X_train), class_names = list(train_char_label))
# Credit_Tree_FileR.close()
# #Works only if "dot" command works on you machine
# retCode = system("dot -Tpng credit_treeR.dot -o credit_treeR.png")
# if(retCode>0):
#     print("system command returning error: "+str(retCode))
# else:
#     display(Image("credit_treeR.png"))
# Draw Decision tree (replaces the commented-out inline export above)
draw_tree(dTreeR, X_train, y_train, 'dtree_model.dot', 'dtree_model.png', ['No', 'Yes'], list(X_train))
# importance of features in the tree building ( The importance of a feature is computed as the
# (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance )
imp_features = pd.DataFrame(dTreeR.feature_importances_, columns = ["Imp"], index = X_train.columns).sort_values(by='Imp', ascending=False)
imp_features[imp_features.Imp > 0.001].sort_values(by='Imp', ascending=False)
# NOTE(review): the label says "Regressor" but dTreeR is a pruned *classifier*;
# the string is kept as-is since it is only a display name in the table.
print_model_score(dTreeR, "Decision Tree Regressor")
df_compare_model
The confusion matrix
True Positives (TP): we correctly predicted that they will open a term deposit 471
True Negatives (TN): we correctly predicted that they will not open a term deposit 11709
False Positives (FP): we incorrectly predicted that they will open a term deposit (a "Type I error") 257 Falsely predict positive Type I error
False Negatives (FN): we incorrectly predicted that they will not open a term deposit (a "Type II error") 1127 Falsely predict negative Type II error
print_model_roc(dTreeR)
visClassifierResults(dTreeR)
# Bagging: 50 bootstrap copies of the unpruned tree dTree.
# NOTE(review): `base_estimator` was renamed to `estimator` in newer sklearn
# (removed in 1.4) — confirm against the installed sklearn version.
from sklearn.ensemble import BaggingClassifier
bgcl = BaggingClassifier(base_estimator=dTree, n_estimators=50,random_state=42)
#bgcl = BaggingClassifier(n_estimators=50,random_state=1)
bgcl = bgcl.fit(X_train, y_train)
# y_predict = bgcl.predict(X_test)
print_model_score(bgcl, "Bagging")
df_compare_model
print_model_roc(bgcl)
visClassifierResults(bgcl)
# Adaptive boosting with 10 weak learners.
from sklearn.ensemble import AdaBoostClassifier
abcl = AdaBoostClassifier(n_estimators=10, random_state=42)
#abcl = AdaBoostClassifier( n_estimators=50,random_state=1)
abcl = abcl.fit(X_train, y_train)
print_model_score(abcl, 'Adaptive Boosting')
df_compare_model
print_model_roc(abcl)
visClassifierResults(abcl)
# Gradient boosting with 50 stages.
from sklearn.ensemble import GradientBoostingClassifier
gbcl = GradientBoostingClassifier(n_estimators = 50,random_state=42)
gbcl = gbcl.fit(X_train, y_train)
print_model_score(gbcl, 'Gradient Boost')
print_model_roc(gbcl)
visClassifierResults(gbcl)
# Random forest with 100 trees.
from sklearn.ensemble import RandomForestClassifier
rfcl = RandomForestClassifier(n_estimators = 100, random_state=42)
# rfcl = RandomForestClassifier(n_estimators = 50, random_state=1, max_features=12)
rfcl = rfcl.fit(X_train, y_train)
print_model_score(rfcl, "Random Forest")
print_model_roc(rfcl)
visClassifierResults(rfcl)
# Final side-by-side comparison of all recorded models.
df_compare_model
Customers predicted to be interested in opening a term deposit account will be approached.
The bank wants to reach people who will open a term deposit; when the model predicts they will not (a False Negative), the bank loses prospective customers. Keeping False Negatives low therefore matters most, so Recall is the important metric.
In the case of a False Positive, the bank wastes some outreach effort on people who do not subscribe, but that is acceptable. This number is quite low — 288 for Random Forest versus 426 for Bagging — and precision is quite good: 68% for Random Forest compared to 61% for Bagging.